Sector Fund

For this project, the sector fund Fidelity Select Technology Portfolio (FSPTX) is chosen as the target fund. On Fidelity’s website, it is categorized as Large Growth.

For comparison, Russell 2000 (^RUT), NASDAQ (^IXIC), S&P 500 (^GSPC), S&P 500 IT Sector, S&P North America Tech Sector, S&P MidCap (^MID) and S&P SmallCap (^SML) were selected as the initial indexes to compare against.

For comparison, Vanguard’s similar index funds are also loaded: IT ETF (VGT), LargeCap ETF (VIGAX) and TotalMarket ETF (VTSAX). Among these, VGT stands in for the MSCI US IM Info. Tech. 25/50 index, VIGAX for the CRSP US Large Cap Growth Index, and VTSAX for the CRSP US Total Market Index.

Before loading the data, we first define some helper functions to ease the data-cleaning process and declare some variables.

## Add daily simple and log returns to a price table, then keep only the rows
## inside the study window.
##
## Date1:     first date (inclusive) to keep; compared against the Date column.
## DataFrame: data frame with a numeric `Close` column (daily close price) and
##            a `Date` column. Returns are taken against the previous row, so
##            rows are presumably in ascending date order -- confirm against
##            the input files.
## EndDate:   last date (inclusive) to keep; defaults to 2018-12-31.
##
## Returns DataFrame with four extra columns:
##   dailyReturn  simple return relative to the previous row's Close
##   log.Close    natural log of Close
##   log.Return   difference of consecutive log.Close values
##   perc_dailyRe dailyReturn in percent, rounded to 3 decimals
##
## The returns are computed before the date filter, so the first row that
## survives the filter still has a return whenever an earlier row exists.
dailynlogReturn <- function(Date1, DataFrame,
                            EndDate = as.Date("2018-12-31")) {
  DataFrame %>%
    dplyr::mutate(
      # Simple return is measured against the PREVIOUS close, not today's.
      # dplyr::lag is spelled out because stats::lag does not shift a plain
      # vector and would make every difference zero.
      dailyReturn = (Close - dplyr::lag(Close)) / dplyr::lag(Close),
      log.Close = log(Close),
      log.Return = log.Close - dplyr::lag(log.Close),
      perc_dailyRe = round(dailyReturn * 100.0, 3)
    ) %>%
    dplyr::filter(Date >= Date1, Date <= EndDate)
}
## Project the value of a 10k investment made on the earliest date in DF,
## assuming all earnings are reinvested.
##
## DF: data frame with a `Date` column and a numeric `Close` column.
##
## Returns DF with an extra column `ProjValper10k` = Close * 10000 / P0, where
## P0 is the Close on the earliest Date. If several rows share the earliest
## date, the first of them supplies P0, so P0 is always a single value.
getProjectionValue <- function(DF) {
  # order() works for Date as well as ISO-formatted character dates and
  # places NA dates last.
  first_row <- order(DF[["Date"]])[1]
  P0 <- DF[["Close"]][first_row]
  DF[["ProjValper10k"]] <- (DF[["Close"]] * 10000) / P0
  DF
}
## Root-mean-square difference between two numeric vectors: the Euclidean
## norm of (x - y) divided by sqrt(length(y)).
##
## x, y: numeric vectors (x - y follows R's usual recycling rules).
##
## Returns a single number. Uses only base R, so no extra package is needed
## for the dot product.
sqerr <- function(x, y) {
  z <- x - y
  sqrt(sum(z * z) / length(y))
}

## Standardize the NAV: append `Close.z`, the z-score of `Close`
## (centred on its mean and scaled by its standard deviation).
standardizedNAV <- function(DF) {
  DF %>%
    mutate(Close.z = (Close - mean(Close)) / sd(Close))
}

## The study only looks at data from 2014-01-01 onwards
StartDate <- as.Date("2014-01-01", format = "%Y-%m-%d")

Load and clean the data.

Load the data:

## Read each price file and run it through dailynlogReturn(), which adds the
## return columns and keeps only rows on or after StartDate.

## Target fund
FSPTX <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("FSPTX.csv"))

## Market indexes
NASDAQ <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^IXIC.csv"))
SnP500 <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^GSPC.csv"))
SnPMID <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^MID.csv"))
SnPSML <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^SML.csv"))
RUSSELL2000 <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("^RUT.csv"))

## Vanguard funds
VGT <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("VGT.csv"))
VIGAX <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("VIGAX.csv"))
VTSAX <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("VTSAX.csv"))

## Technology sector indexes
SnP500Info <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("SnP500Info.csv"))
SnPNATech <- dailynlogReturn(Date1 = StartDate, DataFrame = read_csv("SnPNATECH_clean.csv"))

Pick the dates where tax and dividend events happen

Check DailyReturn anomalies and set them to NA values

## Compare daily returns
## Put FSPTX's daily return next to each benchmark's, row by row. cbind()
## yields a numeric matrix, so the dates pass through as day counts and are
## converted back to Date by as_date() further down.
dailyReturnComp <- cbind(
  as.Date(FSPTX$Date),
  FSPTX$dailyReturn,
  NASDAQ$dailyReturn,
  SnP500$dailyReturn,
  VGT$dailyReturn,
  VIGAX$dailyReturn,
  VTSAX$dailyReturn
)
colnames(dailyReturnComp) <- c(
  "Date", "FSPTX", "NASDAQ", "SnP500", "VGT", "VIGAX", "VTSAX"
)

## A benchmark return whose square is at or below epsilon is treated as zero:
## the vs* column then carries FSPTX's return itself instead of a ratio.
epsilon <- 0.000000000000000001

dailyReturnComp <- data.frame(dailyReturnComp) %>%
  mutate(
    Date = as_date(Date),
    ## ratio of FSPTX's return to each benchmark's return
    vsNASDAQ = ifelse(NASDAQ * NASDAQ <= epsilon, FSPTX, FSPTX / NASDAQ),
    vsSnP500 = ifelse(SnP500 * SnP500 <= epsilon, FSPTX, FSPTX / SnP500),
    vsVGT = ifelse(VGT * VGT <= epsilon, FSPTX, FSPTX / VGT),
    vsVIGAX = ifelse(VIGAX * VIGAX <= epsilon, FSPTX, FSPTX / VIGAX),
    vsVTSAX = ifelse(VTSAX * VTSAX <= epsilon, FSPTX, FSPTX / VTSAX),
    ## difference between FSPTX's return and each benchmark's return
    minusNASDAQ = FSPTX - NASDAQ,
    minusSnP500 = FSPTX - SnP500,
    minusVGT = FSPTX - VGT,
    minusVIGAX = FSPTX - VIGAX,
    minusVTSAX = FSPTX - VTSAX
  )

## Interactive scatter of (FSPTX return - NASDAQ return) over time, with a
## loess smoother and its confidence band.
plotly::plotly_build(
  ggplot(dailyReturnComp, aes(x = Date, y = minusNASDAQ)) +
    geom_point(alpha = .1) +
    geom_smooth(method = "loess", se = TRUE)
)
## Dates on which FSPTX's return fell short of NASDAQ's by more than 0.025
DividenDates <- dailyReturnComp %>%
  dplyr::filter(minusNASDAQ < -0.025) %>%
  dplyr::pull(Date)



## Remove the rows dated in DividenDates from every series
FSPTX <- dplyr::filter(FSPTX, !(Date %in% DividenDates))
NASDAQ <- dplyr::filter(NASDAQ, !(Date %in% DividenDates))
SnP500 <- dplyr::filter(SnP500, !(Date %in% DividenDates))
SnPMID <- dplyr::filter(SnPMID, !(Date %in% DividenDates))
SnPSML <- dplyr::filter(SnPSML, !(Date %in% DividenDates))
RUSSELL2000 <- dplyr::filter(RUSSELL2000, !(Date %in% DividenDates))
VGT <- dplyr::filter(VGT, !(Date %in% DividenDates))
VIGAX <- dplyr::filter(VIGAX, !(Date %in% DividenDates))
VTSAX <- dplyr::filter(VTSAX, !(Date %in% DividenDates))
SnP500Info <- dplyr::filter(SnP500Info, !(Date %in% DividenDates))
SnPNATech <- dplyr::filter(SnPNATech, !(Date %in% DividenDates))

Check Correlation Map of log return

## One column of daily log returns per series, bound side by side.
## The columns are named directly in cbind(); data.frame() then turns the
## names containing parentheses or spaces into syntactic ones
## (e.g. "VGT(IT ETF)" becomes "VGT.IT.ETF.").
DailyReturncor <- cbind(
  FSPTX = FSPTX$log.Return,
  NASDAQ = NASDAQ$log.Return,
  RUSSELL2000 = RUSSELL2000$log.Return,
  SnP500 = SnP500$log.Return,
  SnP500Info = SnP500Info$log.Return,
  SnPNATech = SnPNATech$log.Return,
  SnPMID = SnPMID$log.Return,
  SnPSML = SnPSML$log.Return,
  "VGT(IT ETF)" = VGT$log.Return,
  "VIGAX(LargeCAP)" = VIGAX$log.Return,
  "VTSAX(TotalMarket)" = VTSAX$log.Return
)

DailyReturncor <- data.frame(DailyReturncor)

#print("DailyReturn Correlation")
#cor(DailyReturncor,DailyReturncor)

## Lower-triangle plot of the pairwise correlations of daily log returns,
## drawn with a red-white-blue colour ramp over the range 0.6 to 1.0.
colmat <- colorRampPalette(c("red", "white", "blue"))
corrplot::corrplot(
  cor(DailyReturncor, DailyReturncor),
  type = "lower",
  is.corr = FALSE,
  cl.lim = c(0.6, 1.0),
  col = colmat(100),
  title = "Daily Log Return cor",
  tl.cex = .8,
  mar = c(1, 1, 2, 1)
)

## Correlation matrix as a data frame; show only its first column
## (each series' correlation with FSPTX).
logreturncor <- data.frame(cor(DailyReturncor, DailyReturncor))
dplyr::select(logreturncor, 1)

The best matches according to daily log return are SnPNATech, SnP500Info and NASDAQ. In addition, VGT.IT.ETF is also highly correlated in terms of daily log return.

Try linear regression on log returns

## Regress FSPTX's daily log return on the four best-matching series.
## The trailing "- 1" removes the intercept, so the fit is a pure weighted
## combination of the benchmark returns.
logreturnmodel <- lm(
  FSPTX$log.Return ~ NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return + VGT$log.Return - 1
)

summary(logreturnmodel)
## 
## Call:
## lm(formula = FSPTX$log.Return ~ NASDAQ$log.Return + SnP500Info$log.Return + 
##     SnPNATech$log.Return + VGT$log.Return - 1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0186969 -0.0016586 -0.0000369  0.0016204  0.0115334 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## NASDAQ$log.Return      0.23270    0.03739   6.223 6.64e-10 ***
## SnP500Info$log.Return -0.61227    0.06703  -9.134  < 2e-16 ***
## SnPNATech$log.Return   0.74973    0.06231  12.032  < 2e-16 ***
## VGT$log.Return         0.65983    0.07663   8.611  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002829 on 1247 degrees of freedom
## Multiple R-squared:  0.9378, Adjusted R-squared:  0.9376 
## F-statistic:  4698 on 4 and 1247 DF,  p-value: < 2.2e-16
## Standardized residuals against fitted values, with dashed horizontal
## guides at +2 and -2.
plotly::plotly_build(
  ggplot(logreturnmodel, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2.0, slope = 0.0, linetype = "dashed") +
    geom_abline(intercept = -2.0, slope = 0.0, linetype = "dashed") +
    labs(
      x = "Fitted Values",
      y = "Standardized Residuals",
      title = "Standardized Residual Plot"
    )
)
#plot(logreturnmodel,which = 2)

## Normal QQ plot of the standardized residuals with a dashed reference line
plotly::plotly_build(
  ggplot(logreturnmodel, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    labs(
      x = "Theoretical",
      y = "Standardized Residuals",
      title = "QQnorm Plot"
    )
)

Save the predicted values as the index composite

## Table with one row per trading day: the date, FSPTX's daily log return
## (`FSPTX`) and the regression's fitted value (`composit`).
##
## The columns are built directly as Date and double. Binding them with a
## character column would force the returns through a text representation
## and back, which can lose precision. as.numeric() also drops the names
## carried by the fitted values, so the table keeps default row names.
## data.frame() stops with an error if the three columns differ in length
## instead of recycling the shorter one.
regressiontable <- data.frame(
  Date = as.Date(FSPTX$Date),
  FSPTX = as.numeric(FSPTX$log.Return),
  composit = as.numeric(logreturnmodel$fitted.values)
)

## direction is 1 when FSPTX and the composite moved the same way that day
## (their product is positive) and 0 otherwise.
regressiontable <- dplyr::mutate(
  regressiontable,
  direction = if_else(FSPTX * composit > 0, 1, 0)
)

## Scatter of the direction flag against the composite's log return
ggplot(regressiontable, aes(x = composit, y = direction)) +
  geom_point()

## Logistic regression of the direction flag on the composite's log return
directionmodel <- glm(
  direction ~ composit,
  family = binomial(link = "logit"),
  data = regressiontable
)
summary(directionmodel)
## 
## Call:
## glm(formula = direction ~ composit, family = binomial(link = "logit"), 
##     data = regressiontable)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1763   0.4424   0.4473   0.4524   0.4878  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.24212    0.09603   23.35   <2e-16 ***
## composit     3.47735    8.69290    0.40    0.689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 790.71  on 1250  degrees of freedom
## Residual deviance: 790.55  on 1249  degrees of freedom
## AIC: 794.55
## 
## Number of Fisher Scoring iterations: 5
## ROC curve (true-positive rate against false-positive rate) of the
## direction model, with the dashed diagonal as reference.
fitted.y <- fitted(directionmodel)
observed.y <- regressiontable$direction
perf <- ROCR::performance(
  ROCR::prediction(fitted.y, observed.y),
  measure = "tpr",
  x.measure = "fpr"
)
ROCR::plot(perf)
abline(a = 0, b = 1, lty = 2)

## Binned residual plot: direction-model residuals against its predictions
binnedplot(
  x = predict(directionmodel),
  y = resid(directionmodel)
)

## Density-scaled histogram (70 bins) with an overlaid density curve of the
## gap between FSPTX's log return and the composite's.
plotly::plotly_build(
  ggplot(regressiontable, aes(x = FSPTX - composit)) +
    geom_histogram(aes(y = ..density..), bins = 70, alpha = .5) +
    geom_density() +
    xlab("log return of FSPTX - log return of composit")
)